//
// Project: Disagreement in science: Missing women



clear all
version 15.1



//
// Local macros used by the sections of this do-file

// which pair of gender indicator variables the analysis reads
local female female_genderize
local male female_genderize_placeholder
local male male_genderize

// condition that holds when the gender indicator is non-missing
local known_gender `"female_genderize!=."'




//
// AER
//
// Builds a field-level summary for the AER data (share of regular articles,
// share of comments, and mean of the female indicator among author rows of
// regular articles, per field), saves it to a temporary dataset and plots it.

// call data
use "${data}/output/aer_data_gender.dta", clear
drop if month=="May" & year!=2019  // exclude AEA papers and proceedings
drop if year==2020
keep if (comment | research_article)

// Suffixes of the field_* indicator variables. Both loops below walk this
// list with a running index i, so a single definition keeps the index of the
// female share aligned with the index of the article/comment shares.
local fields microeconomics theory macroeconomics labor econometrics io international finance public health_urban development history lab other

// female representation for regular articles
// (computed on author-article rows, i.e. before restricting to author_id==1)
local i=1
foreach field of local fields {
	sum `female' if comment==0 & field_`field'==1
	local fraction_female_`i' = r(mean)
	local i = `i'+1
}

// count articles rather than author-articles
keep if author_id==1

// create dataset: one posted row per field
tempname memhold
postfile `memhold' field_number fraction_articles fraction_comments fraction_female using "${data}/temporary/aer_fields.dta", replace

// total number of articles (comment==0) and comments (comment==1)
forvalues i=0/1 {
	sum article_id if comment==`i'
	local N_`i' = r(N)
}

// fractions: count in field i of type j divided by the total count of type j
local i=1
foreach field of local fields {
	forvalues j=0/1 {
		sum article_id if field_`field'==1 & comment==`j'
		local N_`i'_`j' = r(N)
		local fraction_`i'_`j' = `N_`i'_`j'' / `N_`j''
	}
	post `memhold' (`i') (`fraction_`i'_0') (`fraction_`i'_1') (`fraction_female_`i'')
	local i=`i'+1
}
postclose `memhold'

// figure
use "${data}/temporary/aer_fields.dta", clear
// comment bars are drawn 0.3 to the right of the article bars; the female
// share marker and the x-axis labels sit at the midpoint between the two bars
generate field_number_shift = field_number + 0.3
generate midpoint = field_number + 0.15
twoway ///
	(bar fraction_articles field_number, barwidth(0.3) color(gs5%80)) ///
	(bar fraction_comments field_number_shift, barwidth(0.3) lcolor(gs6) fcolor(gs15%80)) ///
	(scatter fraction_female midpoint, yaxis(2) color(black) msymbol(S)), ///
	plotregion(style(none)) scheme(s1color) title("") ///
	xtitle("Field classification") ///
	ytitle("Share of articles in given field (bars)") ///
	xlabel(1.15 "1" 2.15 "2" 3.15 "3" 4.15 "4" 5.15 "5" 6.15 "6" 7.15 "7" 8.15 "8" 9.15 "9" 10.15 "10" 11.15 "11" 12.15 "12" 13.15 "13" 14.15 "14", noticks labsize(medsmall)) ///
	ylabel(0 .10 `"0.1"' .20 `"0.2"' .30 `"0.3"' .4 `"0.4"' .5 `"0.5"', grid angle(horizontal) labsize(medsmall)) ///
	yscale(r(., .5)) ///
	ylabel(0 .10 `"0.1"' .20 `"0.2"' .30 `"0.3"' .4 `"0.4"' .5 `"0.5"', grid angle(horizontal) labsize(medsmall) axis(2)) ///
	ytitle("Share of female authors in given field (squares)", axis(2)) ///
	legend(row(2) size(medsmall) order(1 "regular article" 2 "comment" 3 "share of female authors of regular articles") bmargin(l=0 r=0))
graph export "${output}/aer_fields.eps", replace
graph export "${output}/aer_fields.png", replace



//
// Nature
//
// Builds a field-level summary for the Nature data (share of regular
// articles, share of comments, and mean of the female indicator among author
// rows of regular articles, per field), saves it to a temporary dataset and
// plots it.

// call data
use "${data}/output/nature_data_gender.dta", clear
drop if year==2020
keep if comment | research_article

// merge with field information; keep only rows matched in both datasets
merge m:1 article_id using "${data}/output/nature_fields_matched_ids.dta"
keep if _merge==3

// keep 2010-2019 period since field information is available for that period only
keep if year>=2010

// Field indicator variables. Both loops below walk this list with a running
// index i, so a single definition keeps the index of the female share
// aligned with the index of the article/comment shares.
local fields biological earth health physical social

// female representation for regular articles
// (computed on author-article rows, i.e. before restricting to author_id==1)
local i=1
foreach var of varlist `fields' {
	sum `female' if comment==0 & `var'==1
	local fraction_female_`i' = r(mean)
	local i = `i'+1
}

// count articles rather than author-articles
keep if author_id==1

// create dataset: one posted row per field
tempname memhold
postfile `memhold' field fraction_articles fraction_comments fraction_female using "${data}/temporary/nature_fields.dta", replace

// total number of articles (comment==0) and comments (comment==1)
forvalues i=0/1 {
	sum article_id if comment==`i'
	local N_`i' = r(N)
}

// fractions: count in field i of type j divided by the total count of type j
local i=1
foreach var of varlist `fields' {
	forvalues j=0/1 {
		sum article_id if `var'==1 & comment==`j'
		local N_`i'_`j' = r(N)
		local fraction_`i'_`j' = `N_`i'_`j'' / `N_`j''
	}
	post `memhold' (`i') (`fraction_`i'_0') (`fraction_`i'_1') (`fraction_female_`i'')
	local i=`i'+1
}

postclose `memhold'

// figure
use "${data}/temporary/nature_fields.dta", clear
// comment bars are drawn 0.3 to the right of the article bars; the female
// share marker and the x-axis labels sit at the midpoint between the two bars
generate field_shift = field + 0.3
generate midpoint = field + 0.15
twoway ///
	(bar fraction_articles field, barwidth(0.3) color(gs5%80)) ///
	(bar fraction_comments field_shift, barwidth(0.3) lcolor(gs6) fcolor(gs15%80)) ///
	(scatter fraction_female midpoint, yaxis(2) color(black) msymbol(S)), ///
	plotregion(style(none)) scheme(s1color) title("") ///
	xtitle("") ///
	ytitle("Share of articles in given field (bars)") ///
	xlabel(1.15 `""Biological" "sciences""' 2.15 `""Earth" "sciences""' 3.15 `""Health" "sciences""' 4.15 `""Physical" "sciences""' 5.15 `""Social" "sciences""', noticks labsize(medsmall)) ///
	ylabel(0 .15 `"0.15"' .30 `"0.30"' .45 `"0.45"' .6 `"0.60"' .75 `"0.75"', grid angle(horizontal) labsize(medsmall)) ///
	yscale(r(., .75)) ///
	ylabel(0 .15 `"0.15"' .30 `"0.30"' .45 `"0.45"' .6 `"0.60"' .75 `"0.75"', grid angle(horizontal) labsize(medsmall) axis(2)) ///
	ytitle("Share of female authors in given field (squares)", axis(2)) ///
	legend(row(2) size(medsmall) order(1 "regular article" 2 "comment" 3 "share of female authors of regular articles") bmargin(l=0 r=0))
graph export "${output}/nature_fields.eps", replace
graph export "${output}/nature_fields.png", replace



//
// PNAS
//
// Builds a field-level summary for the PNAS data (share of regular articles,
// share of comments, and mean of the female indicator among author rows of
// regular articles, per field), saves it to a temporary dataset and plots it.

// call data
use "${data}/output/pnas_data_gender.dta", clear
drop if full_name=="II" | full_name=="III" | full_name=="IV" | full_name=="Jr" | full_name=="Jr."  // erroneously scraped as separate author-article observations
drop if year==2020 | year<2008  // PNAS started comments in 2008
keep if comment | research_article

// numeric field code derived from the string variable type
generate type_num = .
replace type_num = 1 if type=="Biological Sciences"
replace type_num = 2 if type=="Physical Sciences"
replace type_num = 3 if type=="Social Sciences"

// generate field information for comments based on article-comment links:
// every row with a non-missing call_to takes the mean type_num of the rows
// whose article_id equals call_to.
// NOTE(review): when no row matches call_to (or all matches have missing
// type_num), r(mean) is missing and any type_num the row already had is
// overwritten with missing -- confirm this is intended.
local N = _N
forvalues i=1/`N' {
	if call_to[`i'] != . {
		local id = call_to[`i']
		sum type_num if article_id==`id'
		// "in `i'" addresses observation i directly instead of scanning
		// the whole dataset for _n==i
		replace type_num = r(mean) in `i'
	}
}

// female representation for regular articles
// (computed on author-article rows, i.e. before restricting to author_id==1)
forvalues i=1/3 {
	sum `female' if comment==0 & type_num==`i'
	local fraction_female_`i' = r(mean)
}

// count articles rather than author-articles
keep if author_id==1

// create dataset: one posted row per field
tempname memhold
postfile `memhold' field fraction_articles fraction_comments fraction_female using "${data}/temporary/pnas_fields.dta", replace

// total number of articles and comments with observed field
forvalues i=0/1 {
	sum article_id if comment==`i' & type_num!=.
	local N_`i' = r(N)
}

// fractions: count in field i of type j divided by the total count of type j
forvalues i=1/3 {
	forvalues j=0/1 {
		sum article_id if type_num==`i' & comment==`j'
		local N_`i'_`j' = r(N)
		local fraction_`i'_`j' = `N_`i'_`j'' / `N_`j''
	}
	post `memhold' (`i') (`fraction_`i'_0') (`fraction_`i'_1') (`fraction_female_`i'')
}
postclose `memhold'

// figure
use "${data}/temporary/pnas_fields.dta", clear
// comment bars are drawn 0.3 to the right of the article bars; the female
// share marker and the x-axis labels sit at the midpoint between the two bars
generate field_shift = field + 0.3
generate midpoint = field + 0.15
twoway ///
	(bar fraction_articles field, barwidth(0.3) color(gs5%80)) ///
	(bar fraction_comments field_shift, barwidth(0.3) lcolor(gs6) fcolor(gs15%80)) ///
	(scatter fraction_female midpoint, yaxis(2) color(black) msymbol(S)), ///
	plotregion(style(none)) scheme(s1color) title("") ///
	xtitle("") ///
	ytitle("Share of articles in given field (bars)") ///
	xlabel(1.15 "Biological sciences" 2.15 "Physical sciences" 3.15 "Social sciences", noticks labsize(medsmall)) ///
	ylabel(0 .15 `"0.15"' .30 `"0.30"' .45 `"0.45"' .6 `"0.60"' .75 `"0.75"', grid angle(horizontal) labsize(medsmall)) ///
	yscale(r(., .78)) ///
	ylabel(0 .15 `"0.15"' .30 `"0.30"' .45 `"0.45"' .6 `"0.60"' .75 `"0.75"', grid angle(horizontal) labsize(medsmall) axis(2)) ///
	ytitle("Share of female authors in given field (squares)", axis(2)) ///
	yscale(r(., .78) axis(2)) ///
	legend(row(2) size(medsmall) order(1 "regular article" 2 "comment" 3 "share of female authors of regular articles") bmargin(l=0 r=0))
graph export "${output}/pnas_fields.eps", replace
graph export "${output}/pnas_fields.png", replace

